# NOTE(review): this wipes every object in the calling environment, including
# those of any script that source()s this file. Kept as in the original so the
# steps below still start from an empty workspace.
rm(list = ls(all.names = TRUE))

# Packages whose functions are called unqualified below. They were commented
# out, which makes the script fail in a fresh session; attaching an already
# attached package is harmless.
library(quanteda) # corpus(), tokens(), dfm(), dfm_trim(), convert(), ...
library(dplyr)    # %>%, group_by(), summarise(), arrange()
library(stm)      # searchK(), stm(), estimateEffect()
library(ggplot2)  # ggplot(), ggsave()
library(tidytext) # tidy() for the fitted topic model

# DATA AND PATHS
# --------------
# Restores the saved workspace; the code below expects it to contain `speakers`.
load("./generated_data/1-corpus_and_wfm.RData")

# Build a corpus from `speakers`, taking the document text from its "speech"
# field.
debates <- corpus(speakers, text_field = "speech")

# Word-level tokens with all of the remove_* switches below turned on.
toks <- tokens(
  debates,
  what = "word",
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_twitter = TRUE,
  remove_hyphens = TRUE,
  verbose = TRUE
)

# Document-feature matrix grouped by debate.year and memberID: lower-cased,
# English stopwords removed, stemmed.
debates.dfm <- dfm(
  toks,
  groups = c("debate.year", "memberID"),
  tolower = TRUE,
  stem = TRUE,
  remove = stopwords("english"),
  verbose = TRUE
)

# Trim the vocabulary: keep features with a total frequency of at least 1 that
# appear in at least 2 documents.
dfm.trim <- dfm_trim(debates.dfm, min_docfreq = 2, min_termfreq = 1)


# Keep only documents with more than k words. Shorter ones come from
# interruptions that were combined into a document but are not real speeches.
k <- 50
dfm.small <- dfm.trim[which(rowSums(dfm.trim) > k), ]

# Reduce the DFM to the same cases as the regression estimation file, keeping
# one row per yearMemberID (removes joint portfolio holding).
est_data <- haven::read_dta("./generated_data/7-data_for_regression_models.dta")

# Keep the first row for each yearMemberID.
est_data <- est_data[!duplicated(est_data$yearMemberID), ]

# Sort by yearMemberID and convert to a plain data.frame.
est_data <- as.data.frame(est_data[order(est_data$yearMemberID), ])

# Keep only documents whose name appears in the estimation data. dfm_subset()
# and docnames() are quanteda's documented interface for this; the previous
# code used base subset() and reached into the @Dimnames slot of the S4 object.
dfm_common <- dfm_subset(
  dfm.small,
  docnames(dfm.small) %in% est_data$yearMemberID
)

# Order the documents by name.
dfm_common <- dfm_common[order(docnames(dfm_common)), ]

# Word cloud of the common DFM, written to a PDF file.
# The seed is set right before plotting, as in the original ordering, so the
# random number stream used from here on is unchanged.
pdf(file = "./plots/appendix_figure5.5-wordcloud.pdf", height = 7, width = 10)
set.seed(10)
textplot_wordcloud(
  dfm_common,
  color = "darkblue",
  random_order = FALSE,
  min_count = 3,
  max_words = 200,
  min_size = 0.5,
  max_size = 4
)
dev.off()

# Creating a STM
stm.dfm <- convert(dfm_common, to = "stm")

# Attach the covariates so that row i of the metadata describes document i.
# est_data was only deduplicated and sorted, never restricted to the documents
# that survived the DFM filtering, so assigning it directly could attach extra
# rows or rows in a different order. Align explicitly on the document names.
doc_ids <- docnames(dfm_common)
meta_rows <- match(doc_ids, est_data$yearMemberID)
stopifnot(
  !anyNA(meta_rows),
  length(stm.dfm$documents) == length(meta_rows)
)
stm.dfm$meta <- est_data[meta_rows, , drop = FALSE]
rownames(stm.dfm$meta) <- NULL

# Topic models
## Search for optimal topic model
# Fits candidate models for K = 2..30 with the same prevalence formula that is
# used for the final model.
search <- searchK(stm.dfm$documents, stm.dfm$vocab,
                  K = 2:30, prevalence = ~ s(year) + ws_cab_rescaled,
                  data = stm.dfm$meta)

search.results <- as.data.frame(search$results)

# NOTE(review): some stm releases appear to return the diagnostics as list
# columns, which breaks mean() and the plot below -- confirm against the
# installed version. Flattening is a no-op for columns that are already atomic.
search.results[] <- lapply(search.results, function(col) {
  if (is.list(col)) unlist(col) else col
})

# Exclusivity against semantic coherence, one labelled point per K; the dashed
# lines mark the mean of each measure across all K.
ggplot(search.results, aes(x = semcoh, y = exclus)) +
  geom_point(size = 5, shape = 1, color = "green") +
  geom_text(aes(label = K), size = 2) +
  geom_smooth(method = "lm", se = FALSE, color = "red", size = .3) +
  geom_vline(xintercept = mean(search.results$semcoh), size = .2,
             linetype = "dashed") +
  geom_hline(yintercept = mean(search.results$exclus), size = .2,
             linetype = "dashed") +
  theme_bw() +
  ggtitle("Selecting optimal number of topics") +
  xlab("Semantic coherence") + ylab("Exclusivity")

# No plot is passed, so ggsave() writes the most recently created ggplot.
ggsave("./plots/appendix_figure5.6-topic_search.pdf")


## Topic model estimates
# 18-topic structural topic model with spectral initialisation; topic
# prevalence is modelled as s(year) + ws_cab_rescaled.
topics18 <- stm(
  documents = stm.dfm$documents,
  vocab = stm.dfm$vocab,
  K = 18,
  prevalence = ~ s(year) + ws_cab_rescaled,
  data = stm.dfm$meta,
  init.type = "Spectral"
)

# Summary plot of the topics, labelled with 15 FREX words each.
pdf(file = "./plots/appendix_figure5.7-18topic_frex_words.pdf",
    height = 7, width = 10)

plot(
  topics18,
  type = "summary",
  labeltype = "frex",
  n = 15,
  xlim = c(0, .4),
  text.cex = .6,
  main = "Top 15 most important words (by FREX)"
)

dev.off()

# Structural relationship in the STM.
# For each selected topic, estimate the effect of s(year) + ws_cab_rescaled on
# topic prevalence and save a plot of the ws_cab_rescaled effect to its own
# PDF. This replaces seven copy-pasted stanzas that differed only in the topic
# number. The topics are processed in the original order, so the sequence of
# calls is unchanged and `con.eff` ends up holding the estimate for topic 4.
effect_topics <- c(3L, 7L, 6L, 9L, 5L, 12L, 4L)

for (topic_id in effect_topics) {
  # Build the formula with the literal topic number on the left-hand side,
  # i.e. the same formula the original calls spelled out by hand
  # (e.g. c(3) ~ s(year) + ws_cab_rescaled).
  effect_formula <- as.formula(
    sprintf("c(%d) ~ s(year) + ws_cab_rescaled", topic_id)
  )

  # `metadata` is written out in full; the original relied on partial
  # matching of `meta`.
  con.eff <- estimateEffect(effect_formula,
                            topics18, metadata = stm.dfm$meta,
                            uncertainty = "Global")

  pdf(
    sprintf(
      "./plots/appendix_figure5.8-18topic_topic%d_wordscore_effect.pdf",
      topic_id
    ),
    width = 10, height = 7
  )

  plot(con.eff, covariate = "ws_cab_rescaled",
       model = topics18, method = "continuous",
       main = "Effect of Wordscore position",
       xlab = "Estimated Position")

  # Horizontal reference line at zero.
  abline(h = 0, lty = 1, lwd = .5, col = "black")

  dev.off()
}


# Document-topic proportions (gamma) in long form, with each document labelled
# by the portfolio recorded in the metadata.
documents <- tidy(topics18, matrix = "gamma",
                  document_names = stm.dfm$meta$portfolio)

# Sum gamma within each portfolio label and topic. The result is ungrouped
# explicitly so that no leftover grouping affects later operations on `sums`;
# the sort order is unchanged.
sums <- documents %>%
  group_by(document, topic) %>%
  summarise(Topic_Proportion = sum(gamma)) %>%
  ungroup() %>%
  arrange(topic, Topic_Proportion)

# One horizontal bar chart per topic, with free scales in each facet.
ggplot(sums, aes(x = reorder(document, Topic_Proportion),
                 y = Topic_Proportion, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() + labs(x = "", y = "")

# No plot is passed, so ggsave() writes the most recently created ggplot.
ggsave("./plots/appendix_figure5.9-18topic_topic_proportions.pdf",
       width = 12, height = 10)


